import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
import collections
from datetime import date
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
from sklearn import mixture
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from wordcloud import WordCloud
%matplotlib inline
# Load the raw MyAnimeList exports: anime metadata, cleaned user profiles,
# and the per-user anime rating lists.
anime = pd.read_csv('Animelist.csv')
users = pd.read_csv('users_cleaned.csv')
anime_list = pd.read_csv('animelists_filtered.csv')
#### Preparação da Tabela para o Modelo
#### Implementar modelo 1 - GMM - Gaussian Mixture:
- Descobrir melhor número de Clusters
- Implementar Modelo
- Descobrir melhor número de Clusters
- Implementar Modelo
- Escolher melhor modelo
- Retirar Informações de cada Cluster
#### Preparação da Tabela para o Modelo
#### Implementar modelo 1 - GMM - Gaussian Mixture:
- Descobrir melhor número de Clusters
- Implementar Modelo
- Descobrir melhor número de Clusters
- Implementar Modelo
- Escolher melhor modelo
- Retirar Informações de cada Cluster
- Foram retirados dados nulos
- Foram retirados usuários que deram notas a menos de 20 animes ou a mais de 300;
- Coluna birth_date alterada para age, que contém a idade do usuário
- Colunas que não serão utilizadas no modelo foram retiradas.
- A coluna location, embora eu queira usá-la, não possui dados confiáveis, pois o usuário pode preenchê-la com qualquer string.
# Inspect the users table, then drop the mostly-null 'access_rank' column and
# any remaining rows with null values.
users.head()
users.isnull().sum()
users = users.drop(['access_rank'], axis= 1)
users = users.dropna()
# How many users survived the cleaning.
users['username'].count()
def calculate_age(born):
    """Return the age in whole years, as of today, for birth date `born`."""
    today = date.today()
    years = today.year - born.year
    # Subtract one year if this year's birthday has not happened yet.
    if (today.month, today.day) < (born.month, born.day):
        years -= 1
    return years
# Convert birth_date into an integer 'age' column, then drop the columns that
# the clustering model will not use.
users['users_age'] = pd.to_datetime(users['birth_date'])
users['age'] = users['users_age'].apply(calculate_age)
users = users.drop(['users_age', 'birth_date'], axis= 1)
users = users.drop(['user_watching' , 'user_completed', 'user_onhold', 'user_dropped',
                    'user_plantowatch', 'user_days_spent_watching', 'join_date',
                    'last_online', 'stats_rewatched', 'stats_episodes','location'], axis= 1)
users.head(10)
# Exploratory plots: median mean-score and median age per gender, score by age.
sns.barplot(y=users['stats_mean_score'], x =users['gender'], estimator = np.median )
sns.barplot(y=users['age'], x =users['gender'], estimator = np.median )
sns.barplot(y=users['stats_mean_score'], x =users['age'] )
- Foram retirados dados nulos
- Foram retirados usuários que deram notas a menos de 20 animes ou a mais de 300;
- Usuários que assistiram 0 episódios foram retirados;
- Colunas que não serão utilizadas no modelo foram retiradas.
# Keep only ratings that belong to users surviving the cleaning above.
anime_list.head()
anime_list = anime_list[anime_list['username'].isin(users['username'])]
anime_list.isnull().sum()
anime_list.my_score.describe()
# Entries with zero watched episodes carry no rating signal: inspect, then drop.
anime_list[anime_list['my_watched_episodes']==0].my_watched_episodes.count()
anime_list[anime_list['my_watched_episodes']==0].my_score.describe()
anime_list = anime_list.drop(anime_list[anime_list.my_watched_episodes == 0].index)
anime_list = anime_list[anime_list['username'].notnull()]
# Lookup tables to attach anime titles and user ids to the rating list.
add_title_anime_id = pd.DataFrame(
    {'anime_id': anime['anime_id'],
     'title': anime['title']
     })
add_username_user_id = pd.DataFrame(
    {'user_id': users['user_id'],
     'username': users['username']
     })
anime_list['username'].count()
# BUG FIX: the merge keys were passed as duplicated lists
# (on=['anime_id','anime_id']); a single key name is the correct, equivalent form.
anime_list = pd.merge(anime_list, add_title_anime_id, on='anime_id')
anime_list = pd.merge(anime_list, add_username_user_id, on='username')
anime_list['username'].count()
# Drop users with fewer than 20 or more than 300 rated anime: too few ratings
# gives no signal; too many is an outlier list.
# The original iterated positional indices over two grouped boolean Series;
# a single vectorized mask is equivalent and far faster.
rating_counts = anime_list.groupby('username')['username'].count()
drop_users = list(rating_counts[(rating_counts < 20) | (rating_counts > 300)].index)
users = users[~users['username'].isin(drop_users)]
anime_list = anime_list[~anime_list['username'].isin(drop_users)]
anime_list['username'].count()
# Sanity check: distribution of ratings per remaining user.
aa = anime_list.groupby('username').count()
aa.anime_id.describe()
# Remove per-entry columns that the model does not use.
anime_list = anime_list.drop(['my_start_date', 'my_finish_date', 'my_rewatching',
                              'my_rewatching_ep', 'my_last_updated', 'my_tags',
                              'my_status', 'my_watched_episodes'], axis = 1)
anime_list.head()
- Foram retirados dados nulos
- Retirar Gender NaN
- Retirar Rating NaN
- Retirar Type = Unknown
- Foram retirados animes que, na época, ainda não tinham sido lançados;
- Foram retirados animes que não receberam nota de nenhuma pessoa.
- Devido ao grande número de dados nulos de producer e studio, essas colunas não serão utilizadas no modelo, pois a outra opção seria retirar os dados nulos, o que reduziria a tabela em cerca de 40%.
- Colunas que não serão utilizadas no modelo foram retiradas.
# Inspect and clean the anime metadata table.
anime.head()
anime.describe()
anime.isnull().sum()
# Drop rows without genre; studio/producer are too sparse to keep (~40% null).
anime = anime[anime['genre'].notnull()]
anime = anime.drop(['studio', 'producer'], axis=1)
# Count the rows about to be filtered out (unaired / unrated / unknown).
not_aired = anime[anime['status'] == 'Not yet aired']
not_aired
not_aired['anime_id'].count()
anime[anime['rating'] == 'None']['anime_id'].count()
anime[anime['type'] == 'Unknown']['anime_id'].count()
anime[anime['source'] == 'Unknown']['anime_id'].count()
# Remove unrated, unknown-type/source and not-yet-aired titles.
# NOTE(review): 'None'/'Unknown' are literal strings in this CSV — confirm.
anime = anime[anime['rating'] != 'None']
anime = anime[anime['type'] != 'Unknown']
anime = anime[anime['source'] != 'Unknown']
anime = anime[anime['status'] != 'Not yet aired']
# Drop descriptive columns that the model does not use.
anime = anime.drop(['title_english', 'title_japanese', 'title_synonyms', 'image_url','broadcast',
                    'related', 'licensor', 'opening_theme', 'ending_theme', 'background', 'premiered',
                    'aired_string', 'aired','airing', 'duration','rank','popularity', 'status'], axis=1)
anime.describe()
# Strip spaces from the comma-separated genre string so get_dummies tokens match.
anime["genre"] = anime["genre"].apply(lambda x: x.replace(' ', ''))
anime.head()
# Exploratory plots: rating vs members/score, type vs score, pairwise relations.
sns.barplot(y=anime['rating'], x =anime['members'])
sns.boxplot(data=anime,x='score',y='rating')
sns.boxplot(data=anime,x='type',y='score')
sns.pairplot(data=anime[['rating','members','score','scored_by', 'favorites', 'episodes']],hue='rating')
sns.heatmap(anime.corr(),annot=True)
Filtro Demográfico
### Preparação da Tabela para o Modelo
anime.head()
# Genre was already space-stripped above; re-applying is idempotent and keeps
# this cell self-contained.
anime["genre"] = anime["genre"].apply(lambda x: x.replace(' ', ''))
# One-hot encode genre/type/source/rating and append the numeric columns.
# (A chained DataFrame.replace(' ', '') on the 0/1 dummy frame was a no-op
# and has been removed.)
anime_features = pd.concat([anime["genre"].str.get_dummies(sep=","),
                            pd.get_dummies(anime[["type"]]),
                            pd.get_dummies(anime[["source"]]),
                            pd.get_dummies(anime[["rating"]]),
                            anime[["score"]], anime[["scored_by"]], anime[["favorites"]],
                            anime[["members"]], anime["episodes"]], axis=1)
anime_features.head()
anime_features.columns
# Scale features to [0, 1], then project to 3 principal components.
min_max_scaler = MinMaxScaler()
anime_features_scaled = min_max_scaler.fit_transform(anime_features)
anime_pca = PCA(n_components=3)
anime_pca.fit(anime_features_scaled)
anime_pca_samples = anime_pca.transform(anime_features_scaled)
anime_ps = pd.DataFrame(anime_pca_samples)
anime_ps.head(15)
anime_tocluster = pd.DataFrame(anime_ps[[0,1,2]])
# Visualize the PCA-projected points in 3D.
plt.rcParams['figure.figsize'] = (16, 9)
fig = plt.figure()
ax = Axes3D(fig)
ax.scatter(anime_tocluster[0], anime_tocluster[2], anime_tocluster[1])
plt.title('Data points in 3D PCA axis', fontsize=20)
plt.show()
# Sweep the GMM component count and record the silhouette score for each.
anime_GMM_results = []
components = 20
for n in range(2, components):
    anime_GMM_clusterer = mixture.GaussianMixture(n_components=n).fit(anime_ps)
    anime_GMM_preds = anime_GMM_clusterer.predict(anime_ps)
    # Compute the (expensive) silhouette once per iteration instead of twice;
    # unused per-iteration centers/sample predictions were removed.
    score = silhouette_score(anime_ps, anime_GMM_preds)
    anime_GMM_results.append(score)
    print("Nº of clusters '{}', silhouette score is '{}' ".format(n, score))
plt.plot(range(2, components), anime_GMM_results);
plt.title('Results GMM')
plt.xlabel('n_clusters');
# Candidate component counts highlighted for discussion.
plt.axvline(x=7, color='blue', linestyle='--')
plt.axvline(x=5, color='blue', linestyle='--')
plt.ylabel('Silhouette Score');
plt.show()
# Sweep the GMM component count and record the BIC (lower is better).
anime_GMM_results_BIC = []
components = 20
for n in range(2, components):
    anime_GMM_clusterer = mixture.GaussianMixture(n_components=n).fit(anime_ps)
    # Compute BIC once per iteration; unused per-iteration predictions removed.
    bic = anime_GMM_clusterer.bic(anime_ps)
    anime_GMM_results_BIC.append(bic)
    print("Nº of clusters '{}', BIC score is '{}' ".format(n, bic))
plt.plot(range(2, components), anime_GMM_results_BIC);
plt.title('Results GMM')
plt.xlabel('n_clusters');
# Candidate component counts highlighted for discussion.
plt.axvline(x=5, color='blue', linestyle='--')
plt.axvline(x=4, color='blue', linestyle='--')
plt.ylabel('BIC Score');
plt.show()
# Final GMM fit with 5 components (chosen from the silhouette/BIC sweeps above).
anime_GMM_clusterer = mixture.GaussianMixture(n_components=5).fit(anime_ps)
anime_GMM_preds = anime_GMM_clusterer.predict(anime_ps)
anime_GMM_centers = anime_GMM_clusterer.means_
anime_GMM_sample_preds = anime_GMM_clusterer.predict(anime_pca_samples)
# 3D scatter of the PCA points colored by GMM cluster.
plt.rcParams['figure.figsize'] = (16, 9)
fig = plt.figure()
ax = Axes3D(fig)
ax.scatter(anime_tocluster[0], anime_tocluster[2], anime_tocluster[1], c = anime_GMM_sample_preds)
plt.title('Data points in 3D PCA axis', fontsize=20)
plt.show()
# 2D projection with the component means marked in red.
fig = plt.figure(figsize=(10,8))
plt.scatter(anime_tocluster[1],anime_tocluster[0],c = anime_GMM_sample_preds)
for ci,c in enumerate(anime_GMM_centers):
    plt.plot(c[1], c[0], 'o', markersize=8, color='red', alpha=1)
plt.xlabel('x_values')
plt.ylabel('y_values')
plt.title('Data points in 2D PCA axis', fontsize=20)
plt.show()
# Sweep KMeans cluster counts and record silhouette scores.
anime_kmean_results = []
for i in range(2, 20):
    anime_kmeans = KMeans(n_clusters=i)
    anime_kmeans.fit(anime_ps)
    # Compute the (expensive) silhouette once per iteration instead of twice.
    score = silhouette_score(anime_ps, anime_kmeans.labels_)
    anime_kmean_results.append(score)
    print("Nº of clusters '{}', silhouette score is '{}' ".format(i, score))
plt.plot(range(2,20), anime_kmean_results);
plt.title('Results KMeans')
plt.xlabel('n_clusters');
# Candidate cluster counts highlighted for discussion.
plt.axvline(x=4, color='blue', linestyle='--')
plt.axvline(x=7, color='blue', linestyle='--')
plt.ylabel('Silhouette Score');
plt.show()
# Elbow-method sweep: record KMeans inertia for each cluster count.
anime_kmean_results_elbow = []
for i in range(2,20):
    anime_kmeans = KMeans(n_clusters=i)
    anime_kmeans.fit(anime_ps)
    anime_kmean_results_elbow.append(anime_kmeans.inertia_)
    print("Nº of clusters '{}', Elbow Method is '{}' ".format(i,anime_kmeans.inertia_))
plt.plot(range(2,20), anime_kmean_results_elbow);
plt.title('Results KMeans')
plt.xlabel('n_clusters');
# Candidate cluster counts highlighted for discussion.
plt.axvline(x=4, color='blue', linestyle='--')
plt.axvline(x=7, color='blue', linestyle='--')
plt.ylabel('Elbow Method');
plt.show()
# Final KMeans fit with 7 clusters (chosen from the sweeps above).
anime_kmean_clusterer = KMeans(n_clusters=7,random_state=30).fit(anime_tocluster)
anime_kmean_centers = anime_kmean_clusterer.cluster_centers_
anime_kmean_sample_preds = anime_kmean_clusterer.predict(anime_tocluster)
# 3D scatter of the PCA points colored by KMeans cluster.
plt.rcParams['figure.figsize'] = (16, 9)
fig = plt.figure()
ax = Axes3D(fig)
ax.scatter(anime_tocluster[0], anime_tocluster[2], anime_tocluster[1], c = anime_kmean_sample_preds)
plt.title('Data points in 3D PCA axis', fontsize=20)
plt.show()
# 2D projection with the cluster centers marked in red.
fig = plt.figure(figsize=(10,8))
plt.scatter(anime_tocluster[1],anime_tocluster[0],c = anime_kmean_sample_preds)
for ci,c in enumerate(anime_kmean_centers):
    plt.plot(c[1], c[0], 'o', markersize=8, color='red', alpha=1)
plt.xlabel('x_values')
plt.ylabel('y_values')
plt.title('Data points in 2D PCA axis', fontsize=20)
plt.show()
# Attach cluster labels and summarize per-cluster means of key numeric columns.
anime_features['cluster'] = anime_kmean_sample_preds
anime['cluster'] = anime_kmean_sample_preds

def plot_cluster_mean(frame, column):
    """Bar-plot the per-cluster mean of `column`, sorted ascending."""
    summary = frame.groupby('cluster')[column].mean().reset_index()
    summary = summary.sort_values(column)
    sns.barplot(y=summary[column], x=summary['cluster'])

# The original repeated the same groupby/sort/barplot pattern four times.
plot_cluster_mean(anime_features, 'members')
plot_cluster_mean(anime_features, 'favorites')
plot_cluster_mean(anime_features, 'score')
plot_cluster_mean(anime_features, 'episodes')
# Rating distribution per cluster (7 KMeans clusters). The original built
# seven identical count Series by hand; a dict comprehension is equivalent.
# (A dead `cluster_genre_series = anime.groupby('cluster').sum()` assignment,
# overwritten before use, was removed.)
anime_cluster = pd.DataFrame({
    'Cluster {}'.format(k + 1):
        anime[anime['cluster'] == k].groupby('rating')['rating'].count()
    for k in range(7)
})
anime_cluster
anime_cluster.plot(kind='bar')
# Source-material distribution per cluster (7 KMeans clusters); same DRY
# rewrite as the rating breakdown.
anime_cluster_source = pd.DataFrame({
    'Cluster {}'.format(k + 1):
        anime[anime['cluster'] == k].groupby('source')['source'].count()
    for k in range(7)
})
anime_cluster_source
anime_cluster_source.plot(kind='bar')
# Top-5 genres per cluster: one-hot the genre string, sum occurrences per
# cluster, then take the five largest per cluster column.
genre = anime["genre"].str.get_dummies(sep=",")
genre['cluster'] = anime_kmean_sample_preds
genre_totals = genre.groupby('cluster').sum().transpose()
# The original extracted the seven nlargest Series by hand.
cluster_genre_series = pd.DataFrame({
    'Cluster {}'.format(k + 1): genre_totals.nlargest(5, k)[k]
    for k in range(7)
})
cluster_genre_series_plot = cluster_genre_series.transpose()
cluster_genre_series_plot.plot(kind='bar')
O modelo relaciona pessoas com interesses similares e recomenda baseado nesse inter-relacionamento
### Preparação da Tabela para o Modelo
Não podemos usar o np.log pois criaria valores NaN, impossibilitando o PCA. Então estamos usando o MinMaxScaler.
# Collaborative-filtering table: rows = users, columns = anime titles,
# values = score given (0 where the user has not rated the title).
anime_list = anime_list[anime_list['anime_id'].isin(anime['anime_id'])]
anime_list = anime_list[anime_list['username'].isin(users['username'])]
anime_list = anime_list.drop(['username'], axis = 1)
# BUG FIX: the merge key was a duplicated list (['user_id','user_id']);
# a single key name is the correct, equivalent form.
merge_data = pd.merge(anime_list, users, on='user_id')
merge_data.head()
users_features = pd.crosstab(merge_data['user_id'], merge_data['title'],
                             values=merge_data.my_score, aggfunc='sum')
users_features.head()
# crosstab leaves NaN for unrated (user, title) pairs; fill with 0 so PCA works
# (equivalent to the original boolean-mask assignment).
users_features = users_features.fillna(0)
# Scale to [0, 1] (np.log would create NaN here) and project to 3 components.
min_max_scaler = MinMaxScaler()
users_features_scaled = min_max_scaler.fit_transform(users_features)
users_pca = PCA(n_components=3)
users_pca.fit(users_features_scaled)
users_pca_samples = users_pca.transform(users_features_scaled)
users_ps = pd.DataFrame(users_pca_samples)
users_ps.head(15)
users_tocluster = pd.DataFrame(users_ps[[0,1,2]])
# Visualize the PCA-projected users in 3D.
plt.rcParams['figure.figsize'] = (16, 9)
fig = plt.figure()
ax = Axes3D(fig)
ax.scatter(users_tocluster[0], users_tocluster[2], users_tocluster[1])
plt.title('Data points in 3D PCA axis', fontsize=20)
plt.show()
# Sweep GMM component counts on the user PCA data (silhouette).
users_GMM_results = []
components = 20
for n in range(2, components):
    users_GMM_clusterer = mixture.GaussianMixture(n_components=n).fit(users_ps)
    users_GMM_preds = users_GMM_clusterer.predict(users_ps)
    # Compute the (expensive) silhouette once per iteration instead of twice;
    # unused per-iteration centers/sample predictions were removed.
    score = silhouette_score(users_ps, users_GMM_preds)
    users_GMM_results.append(score)
    print("Nº of clusters '{}', silhouette score is '{}' ".format(n, score))
plt.plot(range(2, components), users_GMM_results);
plt.title('Results GMM')
plt.xlabel('n_clusters');
plt.ylabel('Silhouette Score');
# Candidate component counts highlighted for discussion.
plt.axvline(x=5, color='blue', linestyle='--')
plt.axvline(x=9, color='blue', linestyle='--')
plt.show()
# Sweep GMM component counts on the user PCA data (BIC; lower is better).
users_GMM_results_BIC = []
components = 20
for n in range(2, components):
    users_GMM_clusterer = mixture.GaussianMixture(n_components=n).fit(users_ps)
    # BUG FIX: the print label wrongly said "silhouette score" while printing
    # BIC. BIC is also now computed once per iteration.
    bic = users_GMM_clusterer.bic(users_ps)
    users_GMM_results_BIC.append(bic)
    print("Nº of clusters '{}', BIC score is '{}' ".format(n, bic))
plt.plot(range(2, components), users_GMM_results_BIC);
plt.title('Results GMM')
plt.xlabel('n_clusters');
plt.ylabel('BIC');
# Candidate component counts highlighted for discussion.
plt.axvline(x=5, color='blue', linestyle='--')
plt.axvline(x=4, color='blue', linestyle='--')
plt.show()
# Final GMM fit with 5 components on the user PCA data.
users_GMM_clusterer = mixture.GaussianMixture(n_components=5).fit(users_ps)
users_GMM_preds = users_GMM_clusterer.predict(users_ps)
users_GMM_centers = users_GMM_clusterer.means_
users_GMM_sample_preds = users_GMM_clusterer.predict(users_pca_samples)
# 3D scatter of the PCA points colored by GMM cluster.
plt.rcParams['figure.figsize'] = (16, 9)
fig = plt.figure()
ax = Axes3D(fig)
ax.scatter(users_tocluster[0], users_tocluster[2], users_tocluster[1], c = users_GMM_sample_preds)
plt.title('Data points in 3D PCA axis', fontsize=20)
plt.show()
# 2D projection with the component means marked in red.
fig = plt.figure(figsize=(10,8))
plt.scatter(users_tocluster[1],users_tocluster[0],c = users_GMM_sample_preds)
for ci,c in enumerate(users_GMM_centers):
    plt.plot(c[1], c[0], 'o', markersize=8, color='red', alpha=1)
plt.xlabel('x_values')
plt.ylabel('y_values')
plt.title('Data points in 2D PCA axis', fontsize=20)
plt.show()
# Sweep KMeans cluster counts on the user PCA data (silhouette).
users_kmeans_results = []
for i in range(2, 20):
    users_kmeans = KMeans(n_clusters=i)
    users_kmeans.fit(users_ps)
    # Compute the (expensive) silhouette once per iteration instead of twice.
    score = silhouette_score(users_ps, users_kmeans.labels_)
    users_kmeans_results.append(score)
    print("Nº of clusters '{}', silhouette score is '{}' ".format(i, score))
plt.plot(range(2,20), users_kmeans_results);
plt.title('Results KMeans')
plt.xlabel('n_clusters');
# Candidate cluster counts highlighted for discussion.
plt.axvline(x=4, color='blue', linestyle='--')
plt.axvline(x=10, color='blue', linestyle='--')
plt.ylabel('Silhouette Score');
plt.show()
# Elbow-method sweep: record KMeans inertia for each cluster count.
users_kmean_results_elbow = []
for i in range(2,20):
    users_kmeans = KMeans(n_clusters=i)
    users_kmeans.fit(users_ps)
    users_kmean_results_elbow.append(users_kmeans.inertia_)
    print("Nº of clusters '{}', Elbow Method is '{}' ".format(i,users_kmeans.inertia_))
plt.plot(range(2,20), users_kmean_results_elbow);
plt.title('Results KMeans')
plt.xlabel('n_clusters');
# Candidate cluster counts highlighted for discussion.
plt.axvline(x=4, color='blue', linestyle='--')
plt.axvline(x=7, color='blue', linestyle='--')
plt.ylabel('Elbow Method');
plt.show()
# KMeans fit with 10 clusters on the user PCA data.
users_kmeans_clusterer = KMeans(n_clusters=10,random_state=30).fit(users_tocluster)
users_kmeans_centers = users_kmeans_clusterer.cluster_centers_
users_kmeans_sample_preds = users_kmeans_clusterer.predict(users_tocluster)
# 3D scatter of the PCA points colored by cluster.
plt.rcParams['figure.figsize'] = (16, 9)
fig = plt.figure()
ax = Axes3D(fig)
ax.scatter(users_tocluster[0], users_tocluster[2], users_tocluster[1], c = users_kmeans_sample_preds)
plt.title('Data points in 3D PCA axis', fontsize=20)
plt.show()
# 2D projection with the cluster centers marked in red.
fig = plt.figure(figsize=(10,8))
plt.scatter(users_tocluster[1],users_tocluster[0],c = users_kmeans_sample_preds)
for ci,c in enumerate(users_kmeans_centers):
    plt.plot(c[1], c[0], 'o', markersize=8, color='red', alpha=1)
plt.xlabel('x_values')
plt.ylabel('y_values')
plt.title('Data points in 2D PCA axis', fontsize=20)
plt.show()
# Alternative KMeans fit with 4 clusters, for comparison with K=10.
users_kmeans_clusterer_2 = KMeans(n_clusters=4,random_state=30).fit(users_tocluster)
users_kmeans_centers_2 = users_kmeans_clusterer_2.cluster_centers_
users_kmeans_sample_preds_2 = users_kmeans_clusterer_2.predict(users_tocluster)
# 3D scatter of the PCA points colored by cluster.
plt.rcParams['figure.figsize'] = (16, 9)
fig = plt.figure()
ax = Axes3D(fig)
ax.scatter(users_tocluster[0], users_tocluster[2], users_tocluster[1], c = users_kmeans_sample_preds_2)
plt.title('Data points in 3D PCA axis', fontsize=20)
plt.show()
# 2D projection with the cluster centers marked in red.
fig = plt.figure(figsize=(10,8))
plt.scatter(users_tocluster[1],users_tocluster[0],c = users_kmeans_sample_preds_2)
for ci,c in enumerate(users_kmeans_centers_2):
    plt.plot(c[1], c[0], 'o', markersize=8, color='red', alpha=1)
plt.xlabel('x_values')
plt.ylabel('y_values')
plt.title('Data points in 2D PCA axis', fontsize=20)
plt.show()
Podemos perceber que o modelo GMM não possui um desempenho muito bom neste data set.
Logo, o modelo KMeans é o mais adequado. A quantidade de clusters escolhida foi K = 10, pois ele divide melhor os dados mais dispersos. Podemos perceber similaridade entre os centros do K = 4 e do K = 10.
# Attach the K=10 cluster labels and inspect cluster 0.
users_features['cluster'] = users_kmeans_sample_preds
c0_count = users_features[users_features['cluster'] == 0]
c0_count.head()
# BUG FIX: users_features is indexed by user_id while merge_data has a plain
# RangeIndex, so the original direct column assignment aligned gender values
# to unrelated index labels. Map gender through user_id instead.
users_features['gender'] = (merge_data.drop_duplicates('user_id')
                            .set_index('user_id')['gender'])
# Gender breakdown per cluster (the original repeated the same groupby/count
# ten times by hand).
users_cluster_gender = pd.DataFrame({
    'Cluster {}'.format(k + 1):
        users_features[users_features['cluster'] == k].groupby('gender')['gender'].count()
    for k in range(10)
})
users_cluster_gender = users_cluster_gender.transpose()
users_cluster_gender.plot(kind='bar')
# Adapted from https://www.kaggle.com/tanetboss/user-clustering-for-anime-recommendation/
def createAnimeInfoList(animelist):
    """Collect info for every title of `animelist` found in the module-level
    `anime` table.

    Returns four parallel lists: (genre_list, episode_list, score_list,
    member_list); the numeric entries are 1-element int arrays, matching the
    original behaviour.
    """
    episode_list = list()
    genre_list = list()
    member_list = list()
    score_list = list()
    wanted = set(animelist)  # O(1) membership instead of scanning a list
    for title in anime['title']:
        if title in wanted:
            # Filter the table once per title instead of four times.
            row = anime[anime['title'] == title]
            episode_list.append(row.episodes.values.astype(int))
            member_list.append(row.members.values.astype(int))
            # NOTE(review): astype(int) truncates fractional scores —
            # presumably intentional, but confirm.
            score_list.append(row.score.values.astype(int))
            for genre_value in row.genre.values:
                genre_list.append(genre_value)
    return genre_list, episode_list, score_list, member_list
def count_word(df, ref_col, liste):
keyword_count = dict()
#df[ref_col].apply(lambda x: x.replace(' ', ''))
for s in liste: keyword_count[s] = 0
for liste_keywords in df[ref_col].str.split(','):
if type(liste_keywords) == float and pd.isnull(liste_keywords): continue
for s in [s for s in liste_keywords if s in liste]:
if pd.notnull(s): keyword_count[s] += 1
#______________________________________________________________________
# convert the dictionary in a list to sort the keywords by frequency
keyword_occurences = []
for k,v in keyword_count.items():
keyword_occurences.append([k,v])
keyword_occurences.sort(key = lambda x:x[1], reverse = True)
return keyword_occurences, keyword_count
def makeCloud(Dict, name, color):
    """Render a word cloud titled `name` from `Dict`, a list of
    [word, frequency] pairs, on a `color` background."""
    frequencies = {word: freq for word, freq in Dict}
    cloud = WordCloud(
        width=1500,
        height=500,
        background_color=color,
        max_words=20,
        max_font_size=500,
        normalize_plurals=False)
    cloud.generate_from_frequencies(frequencies)
    fig = plt.figure(figsize=(12, 8))
    plt.title(name)
    plt.imshow(cloud)
    plt.axis('off')
    plt.show()
# Collect the set of all genre keywords appearing in cluster 0's data.
# NOTE(review): in this flattened script `data_c0` is only defined further
# below — in the original notebook this cell must have run after the data_cX
# frames were built; confirm the intended execution order.
set_keywords = set()
for liste_keywords in data_c0['genre'].str.split(',').values:
    if isinstance(liste_keywords, float): continue # only happen if liste_keywords = NaN
    set_keywords = set_keywords.union(liste_keywords)
# Label each user row with its K=10 cluster, then take the 15 titles with the
# highest mean score per cluster. The original repeated the same
# filter/drop/mean/sort pattern ten times by hand.
users_features['cluster'] = users_kmeans_sample_preds
top_titles = []
for k in range(10):
    cluster_means = users_features[users_features['cluster'] == k].drop('cluster', axis=1).mean()
    top_titles.append(list(cluster_means.sort_values(ascending=False)[0:15].index))
(animelist_c0, animelist_c1, animelist_c2, animelist_c3, animelist_c4,
 animelist_c5, animelist_c6, animelist_c7, animelist_c8, animelist_c9) = top_titles
# Build one genre/episode/score/member frame per cluster from its top-title
# list. The original repeated the same three-statement pattern ten times.
cluster_title_lists = [animelist_c0, animelist_c1, animelist_c2, animelist_c3,
                       animelist_c4, animelist_c5, animelist_c6, animelist_c7,
                       animelist_c8, animelist_c9]
cluster_frames = []
for titles in cluster_title_lists:
    frame = pd.DataFrame()
    frame['genre'], frame['episode'], frame['score'], frame['member'] = createAnimeInfoList(titles)
    # The numeric columns arrive as 1-element arrays; flatten to plain ints.
    frame.iloc[:, 1:4] = frame.iloc[:, 1:4].astype(int)
    cluster_frames.append(frame)
(data_c0, data_c1, data_c2, data_c3, data_c4,
 data_c5, data_c6, data_c7, data_c8, data_c9) = cluster_frames
# Per-cluster averages of episode count, score and member count. The original
# spelled out the ten rows by hand; a comprehension builds the same matrix.
cluster_info_users = pd.DataFrame(np.array(
    [[frame['episode'].mean(), frame['score'].mean(), frame['member'].mean()]
     for frame in (data_c0, data_c1, data_c2, data_c3, data_c4,
                   data_c5, data_c6, data_c7, data_c8, data_c9)]),
    columns=['AVG episode ', 'AVG Score', 'AVG Member'])
cluster_info_users['AVG episode '].plot.bar()
cluster_info_users['AVG Score'].plot.bar()
cluster_info_users['AVG Member'].plot.bar()
# Genre keyword frequencies per cluster, then one word cloud per cluster.
# The original called count_word/makeCloud ten times each by hand.
cluster_frames = [data_c0, data_c1, data_c2, data_c3, data_c4,
                  data_c5, data_c6, data_c7, data_c8, data_c9]
cluster_keyword_occurences = []
for frame in cluster_frames:
    occurences, _ = count_word(frame, 'genre', set_keywords)
    cluster_keyword_occurences.append(occurences)
cluster_genre_users = pd.DataFrame({
    'Cluster {}'.format(k + 1): cluster_keyword_occurences[k]
    for k in range(10)
})
cluster_genre_users[:10]
for k, occurences in enumerate(cluster_keyword_occurences):
    makeCloud(occurences[0:10], "cluster {}".format(k), "black")
Estamos lidando com tema subjetivo( Gostar ou não gostar de algo), não existe métricas concretas para serem utilizadas, por isso plataformas como no Netflix, Youtube, entre outros, acaba existindo várias "áreas" com sugestões baseadas em diferentes critérios, como por exemplo:
- Popularidade
- Similaridade com a última produção que você assistiu
- Novidades/ Produtos Novos ou Promoções Pagas
- Filtro Colaborativo
- Filtro Demográfico
- Híbridos
Neste trabalho foram criadas 2 dessas "áreas", o Filtro Colaborativo e o Filtro Demográfico. Usamos neste trabalho as métricas BIC e Silhouette Score para o algoritmo não supervisionado GMM e as métricas "Elbow Method" e Silhouette Score para o algoritmo não supervisionado KMeans.
Essas métricas são apenas para identificar melhor a: quantidade de grupos e seus membros, não para afirmar que o usuário vai gostar do produto apontado.